!pip install -q klib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualization
from matplotlib import pyplot as plt # visualization
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import re
import klib
from google.colab import drive
drive.mount('/content/gdrive')
# Load the train and test CSVs from the mounted Google Drive folder.
train=pd.read_csv("/content/gdrive/MyDrive/AV_job-a-thon-august-2022/train_F3fUq2S.csv")
test=pd.read_csv("/content/gdrive/MyDrive/AV_job-a-thon-august-2022/test_Bk2wfZ3.csv")
# Print column dtypes and non-null counts for both frames.
train.info()
test.info()
# Per-column missing-value counts. These are bare expressions, so in a
# notebook they are only displayed when they end a cell.
train.isnull().sum()
test.isnull().sum()
# First two training rows.
train.head(2)
def count_plot(df, col, rot=None, switchax=None, size=(8, 6), title=''):
    """Draw a count plot of ``df[col]`` with bars ordered by frequency.

    Parameters
    ----------
    df : DataFrame holding the column to plot.
    col : name of the column whose values are counted.
    rot : rotation passed to the tick labels of the category axis.
    switchax : when truthy the categories go on the y axis (horizontal
        bars); otherwise they go on the x axis.
    size : figure size handed to ``plt.figure``.
    title : figure title.

    A new figure is created on every call; nothing is returned.
    """
    plt.figure(figsize=size)
    # value_counts() sorts by descending frequency, which fixes the bar order.
    ordering = df[col].value_counts().index

    if switchax:
        sns.countplot(y=df[col], order=ordering)
        set_label, set_ticks = plt.ylabel, plt.yticks
    else:
        sns.countplot(x=df[col], order=ordering)
        set_label, set_ticks = plt.xlabel, plt.xticks

    plt.title(title, fontsize=25)
    set_label(col, fontsize=20)
    set_ticks(fontsize=14, rotation=rot)
def grp_count_plot(df, col, grp, rot=None, switchax=None, size=(10, 8), title=''):
    """Draw a count plot of ``df[col]`` split by the hue column ``df[grp]``.

    Parameters
    ----------
    df : DataFrame holding both columns.
    col : name of the column whose values are counted.
    grp : name of the column used as hue.
    rot : rotation passed to the tick labels of the category axis.
    switchax : when truthy the categories go on the y axis (horizontal
        bars); otherwise they go on the x axis.
    size : figure size handed to ``plt.figure``.
    title : figure title.

    The category axis is labelled with ``col.capitalize()``. A new figure
    is created on every call; nothing is returned.
    """
    plt.figure(figsize=size)
    # value_counts() sorts by descending frequency, which fixes the bar order.
    ordering = df[col].value_counts().index
    horizontal = bool(switchax)

    if horizontal:
        axes = sns.countplot(y=df[col], hue=df[grp], order=ordering)
    else:
        axes = sns.countplot(x=df[col], hue=df[grp], order=ordering)

    # Re-create the legend anchored at (1.25, 0.5) in axes coordinates.
    axes.legend(loc='center right', bbox_to_anchor=(1.25, 0.5), ncol=1)
    plt.title(title, fontsize=25)

    if horizontal:
        set_label, set_ticks = plt.ylabel, plt.yticks
    else:
        set_label, set_ticks = plt.xlabel, plt.xticks
    set_label(col.capitalize(), fontsize=20)
    set_ticks(fontsize=14, rotation=rot)
def box_plot(df, col, rot=None):
    """Draw a vertical box plot of ``df[col]`` on a new 8x6 figure.

    Parameters
    ----------
    df : DataFrame holding the column to plot.
    col : name of the column to plot.
    rot : rotation applied to the y-axis *label* (not the tick labels).

    The title is ``"<Col> Distribution"`` using ``col.capitalize()``.
    Nothing is returned.
    """
    heading = col.capitalize() + " Distribution"

    plt.figure(figsize=(8, 6))
    sns.boxplot(y=df[col])
    plt.title(heading, fontsize=25)
    plt.ylabel(col, fontsize=20, rotation=rot)
    plt.yticks(fontsize=14)
def side_by_side_plot(df, grp, valcol, rot=None):
    """Plot ``valcol`` split by ``grp`` as a KDE (left) and a box plot (right).

    Parameters
    ----------
    df : DataFrame holding both columns.
    grp : name of the grouping column (hue of the KDE, x axis of the boxes).
    valcol : name of the value column whose distribution is shown.
    rot : rotation forwarded to ``ax2.tick_params``; since no ``axis`` is
        given it is applied to the tick labels of both axes of the box plot.

    Both panels share the title ``"<Grp> Wise <Valcol> Distribution"``.
    Nothing is returned.
    """
    clr = "Paired"
    heading = grp.capitalize() + " Wise " + valcol.capitalize() + " Distribution"
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))

    # Left panel: one density curve per group.
    sns.kdeplot(x=df[valcol], hue=df[grp], ax=ax1, palette=clr)
    ax1.set_title(heading, size=15)
    ax1.set_xlabel(valcol, fontsize=20)

    # Right panel: one box per group.
    sns.boxplot(x=df[grp], y=df[valcol], ax=ax2, palette=clr)
    ax2.set_title(heading, size=15)
    ax2.set_xlabel(grp, fontsize=20)
    ax2.tick_params(rotation=rot)

    # tight_layout only accounts for artists that exist when it is called,
    # so it has to run after titles, labels and tick settings are in place.
    fig.tight_layout()
def group_summary(df, groupcol, value, sort_by='mean', ascending=False):
    """Return ``describe()`` statistics of ``value`` for each ``groupcol`` group.

    Parameters
    ----------
    df : DataFrame holding both columns.
    groupcol : column (or list of columns) to group by.
    value : column whose per-group statistics are computed.
    sort_by : statistic column to order the result by (default ``'mean'``,
        the previous hard-coded behaviour).
    ascending : sort direction (default ``False``: largest first).

    Returns
    -------
    DataFrame with the group key(s) as regular columns followed by the
    columns produced by ``describe()``, sorted by ``sort_by``.

    Raises
    ------
    KeyError
        If ``sort_by`` is not one of the resulting columns, e.g. ``'mean'``
        when ``value`` is non-numeric and ``describe()`` yields no mean.
    """
    summary = df.groupby(groupcol)[value].describe().reset_index()
    if sort_by not in summary.columns:
        raise KeyError(
            f"cannot sort by {sort_by!r}; available columns: {list(summary.columns)}"
        )
    return summary.sort_values(sort_by, ascending=ascending)
def group_by_perc(df, grp, target):
    """Count rows per ``(grp, target)`` pair and express each as a percentage.

    Parameters
    ----------
    df : DataFrame holding both columns.
    grp : name of the first grouping column.
    target : name of the second grouping column; it is also the column
        being counted.

    Returns
    -------
    DataFrame with columns ``grp``, ``target``, ``count`` and ``percentage``,
    sorted by ``percentage`` descending. ``percentage`` is each pair's share
    of the grand total of all counts (not of its own ``grp``), rounded to
    two decimals.
    """
    # Use the documented list form of the aggregation spec; the original
    # passed a set literal, which only works because sets are iterable.
    counts = df.groupby([grp, target])[target].agg(['count']).reset_index()
    total = counts['count'].sum()
    shares = (counts['count'] * 100 / total).round(2)
    return counts.assign(percentage=shares).sort_values('percentage', ascending=False)
# --- click_rate ---
# klib distribution plot of click_rate, with a custom title.
klib.dist_plot(train['click_rate']);
plt.title("Email Campaigning Click Rate Distribution",fontsize=20);
# Summary statistics, then a box plot; rot=90 rotates the y-axis label.
train['click_rate'].describe()
box_plot(train,'click_rate',rot=90)
# --- sender ---
# Frequency of each sender value, most frequent first, as plot and table.
count_plot(train,'sender' ,title='Various Sender')
train['sender'].value_counts()
# click_rate KDE and box plot split by sender, then the per-sender
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'sender','click_rate')
group_summary(train,'sender','click_rate')
# --- subject_len ---
# Number of distinct subject_len values.
train['subject_len'].nunique()
# klib distribution plot, summary statistics and box plot of subject_len.
klib.dist_plot(train['subject_len']);
plt.title("Subject Length Distribution",fontsize=20);
train['subject_len'].describe()
box_plot(train,'subject_len',rot=90)
# Regression of click_rate on subject_len, one facet per sender
# (4 facets per row, independent axes).
sns.lmplot(data=train, x="subject_len", y="click_rate",hue='sender',
col='sender',col_wrap=4,sharex=False,sharey=False,palette='Paired');
# subject_len KDE and box plot split by sender.
side_by_side_plot(train,'sender','subject_len')
# --- mean_paragraph_len ---
# Number of distinct mean_paragraph_len values.
train['mean_paragraph_len'].nunique()
# klib distribution plot, summary statistics and box plot of mean_paragraph_len.
klib.dist_plot(train['mean_paragraph_len']);
plt.title("Average Paragraph Length Distribution",fontsize=20);
train['mean_paragraph_len'].describe()
box_plot(train,'mean_paragraph_len',rot=90)
# Regression of click_rate on mean_paragraph_len, one facet per sender
# (4 facets per row, independent axes).
sns.lmplot(data=train, x="mean_paragraph_len", y="click_rate",hue='sender',
           col='sender',col_wrap=4,sharex=False,sharey=False,palette='Paired');
# mean_paragraph_len KDE and box plot split by sender. The original line
# re-plotted 'subject_len' here, duplicating the plot from the subject_len
# section instead of showing this feature.
side_by_side_plot(train,'sender','mean_paragraph_len')
# --- day_of_week ---
# Replace the numeric day_of_week codes with weekday names (0 -> Monday ... 6 -> Sunday).
# NOTE(review): Series.map yields NaN for values missing from `days`, so
# re-running this line on the already-mapped column would blank it out.
days = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
train['day_of_week']=train['day_of_week'].map(days)
# Frequency of each weekday as horizontal bars, plus the raw counts.
count_plot(train,'day_of_week' ,title='Day of Week',switchax=True)
train['day_of_week'].value_counts()
# Line plot of click_rate against weekday.
plt.figure(figsize=(10,6))
sns.lineplot(data=train,x='day_of_week',y='click_rate');
# click_rate KDE and box plot split by weekday, then the per-weekday
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'day_of_week','click_rate')
group_summary(train,'day_of_week','click_rate')
# Row counts per (sender, weekday) as bar charts, one facet per weekday in
# Monday..Sunday order.
sns.catplot(data=train.groupby(['sender','day_of_week'])['day_of_week'].agg({'count'}).reset_index(),
x='sender',y='count',col='day_of_week',
col_wrap=4,height=3, aspect=.8,
col_order=list(days.values()),
sharex=False,sharey=False,kind='bar',palette='Paired');
# Number of distinct senders that appear on each weekday.
train.groupby(['sender','day_of_week'])['sender'].agg({'count'}).reset_index().groupby(['day_of_week'])['sender'].agg({'count'})
# --- is_weekend ---
# Frequency of each is_weekend value as plot and table.
count_plot(train,'is_weekend' ,title='Weekend')
train['is_weekend'].value_counts()
# click_rate KDE and box plot split by is_weekend, then the per-value
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'is_weekend','click_rate')
group_summary(train,'is_weekend','click_rate')
# Row counts per (sender, is_weekend) as bar charts, one facet per is_weekend value.
sns.catplot(data=train.groupby(['sender','is_weekend'])['is_weekend'].agg({'count'}).reset_index(),
x='sender',y='count',col='is_weekend',
height=4, aspect=.8,
sharex=False,sharey=False,kind='bar',palette='Paired');
# Number of distinct senders that appear for each is_weekend value.
train.groupby(['sender','is_weekend'])['sender'].agg({'count'}).reset_index().groupby(['is_weekend'])['sender'].agg({'count'})
# --- times_of_day ---
# Frequency of each times_of_day value as horizontal bars, plus the raw counts.
count_plot(train,'times_of_day' ,title='Time of the Day',switchax=True)
train['times_of_day'].value_counts()
# Line plot of click_rate against times_of_day, one facet per weekday in
# Monday..Sunday order.
sns.relplot(data=train,x='times_of_day',y='click_rate',sort=True,
hue='day_of_week',col='day_of_week',col_wrap=4,
col_order=list(days.values()),
kind="line",
);
# click_rate KDE and box plot split by times_of_day, then the per-value
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'times_of_day','click_rate')
group_summary(train,'times_of_day','click_rate')
# Row counts per (sender, times_of_day) as bar charts, with facets ordered
# Morning, Noon, Evening.
sns.catplot(data=train.groupby(['sender','times_of_day'])['times_of_day'].agg({'count'}).reset_index(),
x='sender',y='count',col='times_of_day',
col_wrap=3,height=3, aspect=.8,
col_order=['Morning','Noon','Evening'],
sharex=False,sharey=False,kind='bar',palette='Paired');
# Number of distinct senders that appear for each times_of_day value.
train.groupby(['sender','times_of_day'])['sender'].agg({'count'}).reset_index().groupby(['times_of_day'])['sender'].agg({'count'})
# --- category ---
# Frequency of each category value as plot and table.
count_plot(train,'category' ,title='Category of Product')
train['category'].value_counts()
# Line plot of click_rate against category, one facet per weekday in
# Monday..Sunday order.
sns.relplot(data=train,x='category',y='click_rate',sort=True,
hue='day_of_week',col='day_of_week',
col_wrap=4,col_order=list(days.values()),
kind="line",
);
# Same plot faceted by times_of_day (Morning, Noon, Evening).
sns.relplot(data=train,x='category',y='click_rate',sort=True,
hue='times_of_day',col='times_of_day',
col_wrap=3,col_order=['Morning','Noon','Evening'],
kind="line",
);
# click_rate KDE and box plot split by category, then the per-category
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'category','click_rate')
group_summary(train,'category','click_rate')
# Row counts per (category, sender) as bar charts, one facet per sender.
sns.catplot(data=train.groupby(['category','sender'])['sender'].agg({'count'}).reset_index(),
x='category',y='count',col='sender',
col_wrap=4,height=4, aspect=.7,
#col_order=['Morning','Noon','Evening'],
sharex=False,sharey=False,kind='bar',palette='Paired');
# Number of distinct senders per category, largest first.
train.groupby(['sender','category'])['sender'].agg({'count'}).reset_index().groupby(['category'])['sender'].agg({'count'}).reset_index().sort_values('count',ascending=False)
# --- product ---
# Frequency of each product value on a wider figure, plus the raw counts.
count_plot(train,'product' ,title='Product Types',size=(15,6))
train['product'].value_counts()
# --- no_of_CTA ---
# Number of distinct no_of_CTA values.
train['no_of_CTA'].nunique()
# klib distribution plot, summary statistics and box plot of no_of_CTA.
klib.dist_plot(train['no_of_CTA']);
plt.title("Call to Action Distribution",fontsize=20);
train['no_of_CTA'].describe()
box_plot(train,'no_of_CTA',rot=90)
# Regression of click_rate on no_of_CTA, one facet per sender.
sns.lmplot(data=train, x="no_of_CTA", y="click_rate",hue='sender',
col='sender',col_wrap=4,sharex=False,sharey=False,palette='Paired');
# Same regression faceted by weekday (Monday..Sunday order).
sns.lmplot(data=train, x="no_of_CTA", y="click_rate",hue='day_of_week',
col='day_of_week',col_wrap=4,sharex=False,sharey=False,palette='Paired',
col_order=list(days.values()));
# no_of_CTA KDE and box plot split by weekday.
side_by_side_plot(train,'day_of_week','no_of_CTA')
# Same regression faceted by times_of_day (Morning, Noon, Evening).
sns.lmplot(data=train, x="no_of_CTA", y="click_rate",hue='times_of_day',
col='times_of_day',col_wrap=3,sharex=False,sharey=False,palette='Paired',
col_order=['Morning','Noon','Evening']);
# no_of_CTA KDE and box plot split by times_of_day.
side_by_side_plot(train,'times_of_day','no_of_CTA')
# Same regression faceted by product.
sns.lmplot(data=train, x="no_of_CTA", y="click_rate",hue='product',
col='product',col_wrap=3,sharex=False,sharey=False,palette='Paired',
);
# --- mean_CTA_len ---
# Number of distinct mean_CTA_len values.
train['mean_CTA_len'].nunique()
# klib distribution plot, summary statistics and box plot of mean_CTA_len.
klib.dist_plot(train['mean_CTA_len']);
plt.title("Average number of characters in CTA",fontsize=20);
train['mean_CTA_len'].describe()
box_plot(train,'mean_CTA_len',rot=90)
# Regression of mean_CTA_len on no_of_CTA over the whole training set.
sns.lmplot(data=train, x="no_of_CTA", y="mean_CTA_len",palette='Paired');
# Regression of click_rate on mean_CTA_len, one facet per sender.
sns.lmplot(data=train, x="mean_CTA_len", y="click_rate",hue='sender',
col='sender',col_wrap=4,sharex=False,sharey=False,palette='Paired',
);
# Same regression faceted by weekday (Monday..Sunday order).
sns.lmplot(data=train, x="mean_CTA_len", y="click_rate",hue='day_of_week',
col='day_of_week',col_wrap=4,sharex=False,sharey=False,palette='Paired',
col_order=list(days.values()));
# mean_CTA_len KDE and box plot split by weekday.
side_by_side_plot(train,'day_of_week','mean_CTA_len')
# Same regression faceted by times_of_day (Morning, Noon, Evening).
sns.lmplot(data=train, x="mean_CTA_len", y="click_rate",hue='times_of_day',
col='times_of_day',col_wrap=3,sharex=False,sharey=False,palette='Paired',
col_order=['Morning','Noon','Evening']);
# mean_CTA_len KDE and box plot split by times_of_day.
side_by_side_plot(train,'times_of_day','mean_CTA_len')
# Same regression faceted by product.
sns.lmplot(data=train, x="mean_CTA_len", y="click_rate",hue='product',
col='product',col_wrap=3,sharex=False,sharey=False,palette='Paired',
);
# --- is_image ---
# Frequency of each is_image value as plot and table.
count_plot(train,'is_image' ,title='Number of Images in Email')
train['is_image'].value_counts()
# click_rate KDE and box plot split by is_image, then the per-value
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'is_image','click_rate')
group_summary(train,'is_image','click_rate')
# Row counts per (is_image, sender) as bar charts, one facet per sender.
sns.catplot(data=train.groupby(['is_image','sender'])['sender'].agg({'count'}).reset_index(),
x='is_image',y='count',col='sender',
col_wrap=5,height=4, aspect=.7,
sharex=False,sharey=False,kind='bar',palette='Paired');
# Number of distinct is_image values used by each sender, largest first.
train.groupby(['sender','is_image'])['is_image'].agg({'count'}).reset_index().groupby(['sender'])['is_image'].agg({'count'}).reset_index().sort_values('count',ascending=False)
# --- is_personalised ---
# Frequency of each is_personalised value as plot and table.
count_plot(train,'is_personalised' ,title='User Personalized Email')
train['is_personalised'].value_counts()
# click_rate KDE and box plot split by is_personalised, then the per-value
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'is_personalised','click_rate')
group_summary(train,'is_personalised','click_rate')
# --- is_quote ---
# Frequency of each is_quote value as plot and table.
count_plot(train,'is_quote' ,title='Number of Quotes in Email')
train['is_quote'].value_counts()
# click_rate KDE and box plot split by is_quote, then the per-value
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'is_quote','click_rate')
group_summary(train,'is_quote','click_rate')
# Row counts per (is_quote, sender) as bar charts, one facet per sender.
sns.catplot(data=train.groupby(['is_quote','sender'])['sender'].agg({'count'}).reset_index(),
x='is_quote',y='count',col='sender',
col_wrap=5,height=4, aspect=.7,
sharex=False,sharey=False,kind='bar',palette='Paired');
# Number of distinct is_quote values used by each sender, largest first.
train.groupby(['sender','is_quote'])['is_quote'].agg({'count'}).reset_index().groupby(['sender'])['is_quote'].agg({'count'}).reset_index().sort_values('count',ascending=False)
# --- is_timer ---
# Frequency of each is_timer value as plot and table.
count_plot(train,'is_timer' ,title='Timer in Email')
train['is_timer'].value_counts()
# --- is_emoticons ---
# Frequency of each is_emoticons value as plot and table.
count_plot(train,'is_emoticons' ,title='Emoticons in Email')
train['is_emoticons'].value_counts()
# click_rate KDE and box plot split by is_emoticons, then the per-value
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'is_emoticons','click_rate')
group_summary(train,'is_emoticons','click_rate')
# Row counts per (is_emoticons, sender) as bar charts, one facet per sender.
sns.catplot(data=train.groupby(['is_emoticons','sender'])['sender'].agg({'count'}).reset_index(),
x='is_emoticons',y='count',col='sender',
col_wrap=5,height=4, aspect=.7,
sharex=False,sharey=False,kind='bar',palette='Paired');
# Number of distinct is_emoticons values used by each sender, largest first.
train.groupby(['sender','is_emoticons'])['is_emoticons'].agg({'count'}).reset_index().groupby(['sender'])['is_emoticons'].agg({'count'}).reset_index().sort_values('count',ascending=False)
# --- is_discount ---
# Frequency of each is_discount value as plot and table.
count_plot(train,'is_discount' ,title='Discount in Email')
train['is_discount'].value_counts()
# click_rate KDE and box plot split by is_discount, then the per-value
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'is_discount','click_rate')
group_summary(train,'is_discount','click_rate')
# --- is_price ---
# Inspect the raw is_price values and their frequencies before recoding.
train['is_price'].unique()
train['is_price'].value_counts()
# Recode is_price in place to a 0/1 flag: 1 where the value is > 0, else 0.
train['is_price']=(train['is_price'] > 0).astype('int')
# Frequency of the recoded flag as plot and table.
count_plot(train,'is_price' ,title='Price in Email')
train['is_price'].value_counts()
# click_rate KDE and box plot split by the flag, then the per-value
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'is_price','click_rate')
group_summary(train,'is_price','click_rate')
# --- is_urgency ---
# Frequency of each is_urgency value as plot and table.
count_plot(train,'is_urgency' ,title='Urgency in Email')
train['is_urgency'].value_counts()
# click_rate KDE and box plot split by is_urgency, then the per-value
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'is_urgency','click_rate')
group_summary(train,'is_urgency','click_rate')
# --- target_audience ---
# Frequency of each target_audience value as plot and table.
count_plot(train,'target_audience' ,title='Target Audience Clusters')
train['target_audience'].value_counts()
# click_rate KDE and box plot split by target_audience, then the per-value
# describe() table sorted by mean click_rate (descending).
side_by_side_plot(train,'target_audience','click_rate')
group_summary(train,'target_audience','click_rate')
# Row counts per (target_audience, sender) as bar charts, one facet per sender.
sns.catplot(data=train.groupby(['target_audience','sender'])['sender'].agg({'count'}).reset_index(),
x='target_audience',y='count',col='sender',
col_wrap=4,height=4, aspect=.7,
sharex=False,sharey=False,kind='bar',palette='Paired');
# Number of distinct target_audience values addressed by each sender, largest first.
train.groupby(['sender','target_audience'])['target_audience'].agg({'count'}).reset_index().groupby(['sender'])['target_audience'].agg({'count'}).reset_index().sort_values('count',ascending=False)